neon.data.list <- neonUtilities::loadByProduct(
dpID = "DP1.20107.001",
site = c(c('COMO','LECO')),
startdate = "2016-01",
enddate = "2018-11",
token = Sys.getenv("NEON_TOKEN"),
check.size = FALSE)
neon.data.product.id = "DP1.20107.001"
my_token <- Sys.getenv("NEON_TOKEN")
map_neon.ecocomdp.20107.001.001(
neon.data.list = neon.data.list,
token = Sys.getenv("NEON_TOKEN")
)
##############################################################################################
##############################################################################################
# @describeIn map_neon_data_to_ecocomDP This method will retrieve count data for FISH taxa from neon.data.product.id DP1.20107.001 from the NEON data portal and map to the ecocomDP format
##############################################################################################
# mapping function for FISH
map_neon.ecocomdp.20107.001.001 <- function(
neon.data.list,
neon.data.product.id = "DP1.20107.001",
...){
#NEON target taxon group is FISH
neon_method_id <- "neon.ecocomdp.20107.001.001"
# make sure neon.data.list matches the method
if(!any(grepl(
neon.data.product.id %>% gsub("^DP1\\.","",.) %>% gsub("\\.001$","",.),
names(neon.data.list)))) stop(
"This dataset does not appeaer to be sourced from NEON ",
neon.data.product.id,
" and cannot be mapped using method ",
neon_method_id)
# get token from dots if available
my_dots <- list(...)
if("token" %in% names(my_dots)){
my_token <- my_dots$token
}else{
my_token <- NA
}
# get fish taxon table ----
# get taxon table from API, may take a few minutes to load
# Fish electrofishing, gill netting, and fyke netting counts
# http://data.neonscience.org/data-product-view?dpCode=DP1.20107.001
full_taxon_fish <- neonOS::getTaxonList(
taxonType = "FISH", token = my_token)
# -- make ordered taxon_rank list for a reference (subspecies is smallest rank, kingdom is largest)
# a much simple table with useful levels of taxonomic resolution;
# this might not be needed if taxon rank is extracted from scientific names using stringr functions
taxon_rank_fish <- c('superclass', 'class', 'subclass', 'infraclass', 'superorder',
'order', 'suborder', 'infraorder', 'section', 'subsection',
'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus',
'subgenus','speciesGroup','species','subspecies') %>% rev() # get the reversed version
# get data FISH data from NEON api
all_fish <- neon.data.list
# The object returned by loadByProduct() is a named list of data frames.
# To work with each of them, select them from the list using the $ operator.
# this joins reach-level data, you can just join by any two of these, and the results are the same, dropping the unique ID
# join field data table with the pass tables from all_fish
fsh_dat1 <- dplyr::left_join(
all_fish$fsh_perPass,
all_fish$fsh_fieldData,
by = c("reachID", "siteID"),
suffix = c("_perPass","_filedData"),
multiple = "all") %>%
dplyr::filter(is.na(samplingImpractical) | samplingImpractical == "") %>% #remove records where fish couldn't be collected, both na's and blank data
tidyr::as_tibble()
# get rid of dupe col names and .x suffix
fsh_dat1 <- fsh_dat1[, !grepl("_filedData", names(fsh_dat1))]
names(fsh_dat1) <- gsub("_perPass", "", names(fsh_dat1))
# add individual fish counts; perFish data = individual level per each specimen captured
# this joins reach-level field data with individual fish measures
fsh_dat_indiv <- dplyr::left_join(
all_fish$fsh_perFish,
fsh_dat1,
by = "eventID",
multiple = "all") %>%
tidyr::as_tibble()
# get rid of dupe col names and .x suffix
fsh_dat_indiv <- fsh_dat_indiv[, !grepl('\\.y', names(fsh_dat_indiv))]
names(fsh_dat_indiv) <- gsub('\\.x', '', names(fsh_dat_indiv))
# fill in missing reachID from event ID
fsh_dat_indiv$reachID <- ifelse(is.na(fsh_dat_indiv$reachID),
substr(fsh_dat_indiv$eventID, 1, 16),
fsh_dat_indiv$reachID)
# add bulk fish counts; bulk count data = The number of fish counted during bulk processing in each pass
# this will join all the reach-level field data with the species data with bulk counts
fsh_dat_bulk <- dplyr::left_join(
all_fish$fsh_bulkCount,
fsh_dat1, by = "eventID") %>%
tidyr::as_tibble()
# get rid of dupe col names and .x suffix
fsh_dat_bulk <- fsh_dat_bulk[, !grepl('\\.y', names(fsh_dat_bulk))]
names(fsh_dat_bulk) <- gsub('\\.x', '', names(fsh_dat_bulk))
#fill in missing reachID
fsh_dat_bulk$reachID <- ifelse(is.na(fsh_dat_bulk$reachID),
substr(fsh_dat_bulk$eventID, 1, 16),
fsh_dat_bulk$reachID)
# add taxonRank into fsh_dat from full_taxon_fish
# join the above with bulkCount dataset
fsh_dat_bulk <- dplyr::left_join(
fsh_dat_bulk,
dplyr::select(
full_taxon_fish, taxonID, scientificName, taxonRank),
by = c("taxonID", "scientificName"))
# combine indiv and bulk counts
fsh_dat <- dplyr::bind_rows(fsh_dat_indiv, fsh_dat_bulk)
# add count = 1 for indiv data
# before row_bind, the indiv dataset did not have a col for count
# after bind, the bulk count col has "NAs" need to add "1", since indiv col has individual fish per row
fsh_dat$count <- ifelse(is.na(fsh_dat$bulkFishCount), 1, fsh_dat$bulkFishCount)
# get all records that have rank upto subspecies, variable taxonomic resolution
fsh_dat_fine <- dplyr::filter(
fsh_dat,
taxonRank %in% c("class", "family", "genus", "order", "phylum", "species", "subgenus", "subspecies"))
# select passes with no fish were caught
no_fsh <- dplyr::filter(all_fish$fsh_perPass, targetTaxaPresent == "N")
# all records from fieldData where sampling was done, where sampling was not impractical
fsh_sampled <- dplyr::filter(all_fish$fsh_fieldData, is.na(samplingImpractical) | samplingImpractical == "")
#join the no-fish dataset with the field dataset where sampling was done
no_fsh2 <- dplyr::left_join(x = no_fsh, y = fsh_sampled, by = c("reachID", "siteID", "domainID", "namedLocation"))
# get rid of dupe col names and .x suffix
no_fsh2 <- no_fsh2[, !grepl('\\.y', names(no_fsh2))]
names(no_fsh2) <- gsub('\\.x', '', names(no_fsh2))
# fill in missing reachID from event ID
no_fsh2$reachID <- ifelse(is.na(no_fsh2$reachID),
substr(no_fsh2$eventID, 1, 16),
no_fsh2$reachID)
# add a count variables, which should be zero since these are the reaches with zero fish captures
no_fsh2 <- no_fsh2 %>% dplyr::mutate(count = rep_len(0, length.out = nrow(no_fsh2)),
scientificName = NA, taxonID = NA) %>%
dplyr::mutate( dplyr::across(c("scientificName", "taxonID"), ~as.character(.)))
# join the no_fish sampling sessions to the sampling sessions with fish
if(nrow(no_fsh2) > 0) fsh_dat_fine <- dplyr::bind_rows(fsh_dat_fine, no_fsh2)
# need to convert POSIXct format into as.character and then back to date-time format
# then, fill in missing site ID info and missing startDate into
fsh_dat_fine$startDate <- dplyr::if_else(is.na(fsh_dat_fine$startDate),
lubridate::as_datetime(substr(as.character(fsh_dat_fine$passStartTime), 1, 10)), fsh_dat_fine$startDate) # 1-10: number of characters on date
fsh_dat_fine$siteID <- dplyr::if_else(is.na(fsh_dat_fine$siteID),
substr(fsh_dat_fine$eventID, 1, 4), fsh_dat_fine$siteID) # four characters on site
# grouping vars for aggregating density measurements
my_grouping_vars <- c('domainID','siteID','aquaticSiteType','namedLocation',
'startDate', 'endDate', 'reachID','eventID','samplerType',
'aquaticSiteType', 'netSetTime', 'netEndTime',
'netDeploymentTime', 'netLength', 'netDepth', 'fixedRandomReach',
'measuredReachLength','efTime', 'efTime2',
"passStartTime", "passEndTime", 'netDeploymentTime',
'scientificName', 'taxonID', 'passNumber', 'taxonRank', 'targetTaxaPresent')
my_grouping_vars <- dplyr::intersect(
my_grouping_vars,
names(fsh_dat_fine))
# added a few metrics to quantify catch per unit effort such as 'passNumber', efish time, net deployment time
# aggregate densities for each species group, pull out year and month from StartDate
fsh_dat_aggregate <- fsh_dat_fine %>%
dplyr::select(dplyr::all_of(c(my_grouping_vars, 'count'))) %>%
dplyr::group_by_at(dplyr::vars(dplyr::all_of(my_grouping_vars))) %>%
dplyr::summarize(
number_of_fish = sum(count),
n_obs = dplyr::n()) %>%
dplyr::mutate(
year = startDate %>% lubridate::year(),
month = startDate %>% lubridate::month()
) %>% dplyr::ungroup()
# some aquaticSiteType are NA, replace NAs if-based wildcarding namedLocation
# grepl is for wildcarding
fsh_dat_aggregate$aquaticSiteType <- dplyr::if_else(
is.na(fsh_dat_aggregate$aquaticSiteType),
dplyr::if_else(
grepl("fish.point", fsh_dat_aggregate$namedLocation), 'stream','lake'),
fsh_dat_aggregate$aquaticSiteType)
# sampler type is also missing in a few cases, reaplace from eventID, with a wildcard
fsh_dat_aggregate$samplerType <- dplyr::if_else(
is.na(fsh_dat_aggregate$samplerType),
dplyr::if_else(grepl("e-fisher", fsh_dat_aggregate$eventID), 'electrofisher',
dplyr::if_else(grepl("gill", fsh_dat_aggregate$eventID), 'gill net', 'mini-fyke net')),
fsh_dat_aggregate$samplerType)
# make sure that e-fish samples have "" or NA for netset and netend times, this will leave actual missing data as na
# change netset/netend times to a datetime format if needed: lubridate::as_datetime(fsh_xx$netSetTime, format="%Y-%m-%d T %H:%M", tz="GMT")
## then calculate the netdeploymenttime (in hours) from netset and netend time
# to calculate pass duration (in mins) and also calculate average efish time (secs); also if efishtime is zero, --> na's
data_fish <-
fsh_dat_aggregate %>%
dplyr::mutate(
netSetTime = dplyr::if_else(
condition = samplerType == "electrofisher" | samplerType == "two electrofishers",
# true = lubridate::as_datetime(NA), false = netSetTime),
true = NA_character_, false = as.character(netSetTime)),
netEndTime = dplyr::if_else(
condition = samplerType == "electrofisher" | samplerType == "two electrofishers",
# true = lubridate::as_datetime(NA), false = netEndTime),
true = NA_character_, false = as.character(netEndTime)),
netDeploymentTime = dplyr::case_when(
samplerType == grepl("electrofish", samplerType) ~ netDeploymentTime,
is.na(netDeploymentTime) ~ as.numeric(difftime(netEndTime, netSetTime, tz = "GMT", units = "hours")),
TRUE ~ netDeploymentTime),
mean_efishtime = base::rowMeans(dplyr::select(., c("efTime", "efTime2")), na.rm = T),
mean_efishtime = dplyr::case_when(mean_efishtime == 0 ~ NA_real_,TRUE ~ mean_efishtime))
# with the above changes, we have efish time and and net duration to calculate
# catch per unit effort before moving to wide format
# CPUE with efish time, calculated by = (total number of fish/average e-fish time
# in secs * 3600) as fish captured per 1-hr of e-fishing
# CPUE with gill nets and fyke nets calculated by = (total number of fish/netDeployment time in hours * 24)
# as fish captured per 1-day-long net deployment of e-fishing
data_fish <- data_fish %>%
dplyr::mutate(
catch_per_effort = dplyr::if_else(
condition = samplerType == "electrofisher" | samplerType == "two electrofishers",
true = number_of_fish/mean_efishtime * 3600,
false = dplyr::if_else(condition = samplerType == "mini-fyke net" | samplerType == "gill net",
true = number_of_fish/netDeploymentTime * 24, false = as.numeric(NA))))
# make wide for total observations without catch per unit efforts
fsh_dat_wide_total.obs <- data_fish %>%
dplyr::group_by(
year, month, siteID, namedLocation, reachID, fixedRandomReach,
aquaticSiteType, samplerType, targetTaxaPresent ) %>%
tidyr::pivot_wider(
names_from = scientificName,
values_from = number_of_fish,
names_repair = "unique",
values_fill = 0,
names_sep = "_") %>%
dplyr::select(-n_obs, -catch_per_effort) # these two cols are misleading in this wide format
data_fish = dplyr::left_join(
data_fish,
unique(dplyr::select(
all_fish$fsh_fieldData, namedLocation, decimalLatitude,
decimalLongitude, coordinateUncertainty, geodeticDatum,
elevation, elevationUncertainty)),
by = "namedLocation")
data_fish = dplyr::filter(data_fish, !is.na(taxonID)) %>%
dplyr::distinct()
#location ----
table_location_raw <- data_fish %>%
dplyr::select(domainID, siteID, namedLocation,
decimalLatitude, decimalLongitude, elevation,
geodeticDatum, aquaticSiteType) %>%
dplyr::distinct()
table_location <- make_neon_location_table(
loc_info = table_location_raw,
loc_col_names = c("domainID", "siteID", "namedLocation"))
table_location_ancillary <- make_neon_ancillary_location_table(
loc_info = table_location_raw,
loc_col_names = c("domainID", "siteID", "namedLocation"),
ancillary_var_names = c("namedLocation", "aquaticSiteType", "geodeticDatum"))
# observation ----
my_package_id <- paste0(
neon_method_id, ".", format(Sys.time(), "%Y%m%d%H%M%S"))
table_observation_wide_all <- data_fish %>%
dplyr::left_join(
all_fish$fsh_perPass %>%
dplyr::select(
eventID,
publicationDate, release) %>%
dplyr::distinct(),
by = "eventID"
) %>%
# dplyr::rename(location_id, plotID, trapID) %>%
dplyr::rename(location_id = namedLocation) %>%
# package id
dplyr::mutate(
package_id = my_package_id) %>%
dplyr::rename(
# neon_event_id = eventID,
datetime = startDate,
taxon_id = taxonID,
value = catch_per_effort) %>%
dplyr::mutate(
observation_id = paste0("obs_",1:nrow(.)),
event_id = eventID,
# event_id = observation_id,
# neon_event_id = eventID,
variable_name = "abundance",
unit = "catch per unit effort")
table_observation <- table_observation_wide_all %>%
dplyr::select(
observation_id,
event_id,
package_id,
location_id,
datetime,
taxon_id,
variable_name,
value,
unit) %>%
dplyr::filter(!is.na(taxon_id))
table_observation_ancillary <- make_neon_ancillary_observation_table(
obs_wide = table_observation_wide_all,
ancillary_var_names = c(
"observation_id",
# "neon_event_id",
"reachID",
"samplerType",
"habitatType",
"subdominantHabitatType",
"waterTemp",
"dissolvedOxygen",
"specificConductance",
"netSetTime",
"netEndTime",
"netDeploymentTime",
"netLength",
"netDepth",
"fixedRandomReach",
"measuredReachLength",
"efTime",
"efTime2",
"passStartTime",
"passEndTime",
"mean_efishtime",
"remarks",
"release",
"publicationDate"))
# taxon ----
table_taxon <- full_taxon_fish %>%
dplyr::select(taxonID, taxonRank, scientificName, nameAccordingToID) %>%
dplyr::filter(taxonID %in% data_fish$taxonID) %>%
dplyr::distinct() %>%
dplyr::rename(taxon_id = taxonID,
taxon_rank = taxonRank,
taxon_name = scientificName,
authority_system = nameAccordingToID) %>%
dplyr::select(taxon_id,
taxon_rank,
taxon_name,
authority_system)
# data summary ----
# make dataset_summary -- required table
years_in_data <- table_observation$datetime %>% lubridate::year()
# years_in_data %>% ordered()
table_dataset_summary <- data.frame(
package_id = table_observation$package_id[1],
original_package_id = neon.data.product.id,
length_of_survey_years = max(years_in_data) - min(years_in_data) + 1,
number_of_years_sampled = years_in_data %>% unique() %>% length(),
std_dev_interval_betw_years = years_in_data %>%
unique() %>% sort() %>% diff() %>% stats::sd(),
max_num_taxa = table_taxon$taxon_id %>% unique() %>% length()
)
# return tables ----
out_list <- list(
location = table_location,
location_ancillary = table_location_ancillary,
taxon = table_taxon,
observation = table_observation,
observation_ancillary = table_observation_ancillary,
dataset_summary = table_dataset_summary)
return(out_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.